require(GGally, quietly = TRUE)
require(reshape2, quietly = TRUE)
require(tidyverse, quietly = TRUE, warn.conflicts = FALSE)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
library(ggfortify)
library(cluster)
library(ggdendro)
library(broom)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(readr)
theme_set(theme_bw())
personagens = read_csv(file = "../dados/film-dialogue/character_list5.csv")
## Parsed with column specification:
## cols(
## script_id = col_integer(),
## imdb_character_name = col_character(),
## words = col_integer(),
## gender = col_character(),
## age = col_character()
## )
personagens = personagens %>%
filter(age != 'NULL') %>%
mutate(age = as.numeric(age))
filmes = read.csv(file = "../dados/film-dialogue/meta_data7.csv")
filmes = filmes %>%
filter(gross != 'NA')
filmes_personagens = merge(filmes, personagens, by="script_id")
mulheres = filmes_personagens %>%
filter(gender == 'f') %>%
group_by(script_id, imdb_id, title, year, gross) %>%
summarise(n_f=n(), words_f=median(words))
homens = filmes_personagens %>%
filter(gender == 'm') %>%
group_by(script_id, imdb_id, title, year, gross) %>%
summarise(n_m=n(), words_m=median(words))
dados = merge(mulheres, homens,
by=c('script_id','imdb_id','title','year','gross'))
duplicados = dados %>%
group_by(title) %>% filter(row_number() > 1)
dados = dados %>%
filter(!(title %in% duplicados$title))
dados = dados %>%
subset(select = -c(script_id,imdb_id,year,gross))
dw = dados
summary(dw)
## title n_f words_f
## (500) Days of Summer : 1 Min. : 1.000 Min. : 101.0
## 10 Things I Hate About You: 1 1st Qu.: 2.000 1st Qu.: 340.6
## 12 Years a Slave : 1 Median : 3.000 Median : 584.5
## 127 Hours : 1 Mean : 2.957 Mean : 827.3
## 1408 : 1 3rd Qu.: 4.000 3rd Qu.:1017.9
## 1492: Conquest of Paradise: 1 Max. :14.000 Max. :7664.0
## (Other) :1572
## n_m words_m
## Min. : 1.000 Min. : 114.0
## 1st Qu.: 4.000 1st Qu.: 350.0
## Median : 6.000 Median : 538.0
## Mean : 6.525 Mean : 759.4
## 3rd Qu.: 8.000 3rd Qu.: 903.0
## Max. :23.000 Max. :5716.0
##
dw %>%
select(-title) %>%
ggpairs()
# Escala de log
dw2 <- dw %>%
mutate_each(funs(log), 2:5)
dw2 %>%
select(-title) %>%
ggpairs()
summary(select(dw2, -title))
## n_f words_f n_m words_m
## Min. :0.0000 Min. :4.615 Min. :0.000 Min. :4.736
## 1st Qu.:0.6931 1st Qu.:5.831 1st Qu.:1.386 1st Qu.:5.858
## Median :1.0986 Median :6.371 Median :1.792 Median :6.288
## Mean :0.9166 Mean :6.392 Mean :1.740 Mean :6.364
## 3rd Qu.:1.3863 3rd Qu.:6.925 3rd Qu.:2.079 3rd Qu.:6.806
## Max. :2.6391 Max. :8.944 Max. :3.135 Max. :8.651
dw2.scaled = dw2 %>%
mutate_each(funs(as.vector(scale(.))), 2:5)
summary(dw2.scaled)
## title n_f words_f
## (500) Days of Summer : 1 Min. :-1.5573 Min. :-2.26001
## 10 Things I Hate About You: 1 1st Qu.:-0.3796 1st Qu.:-0.71409
## 12 Years a Slave : 1 Median : 0.3093 Median :-0.02742
## 127 Hours : 1 Mean : 0.0000 Mean : 0.00000
## 1408 : 1 3rd Qu.: 0.7981 3rd Qu.: 0.67800
## 1492: Conquest of Paradise: 1 Max. : 2.9267 Max. : 3.24527
## (Other) :1572
## n_m words_m
## Min. :-3.14947 Min. :-2.3466
## 1st Qu.:-0.64030 1st Qu.:-0.7295
## Median : 0.09358 Median :-0.1097
## Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.61428 3rd Qu.: 0.6369
## Max. : 2.52572 Max. : 3.2971
##
dw2.scaled %>%
select(-title) %>%
ggpairs()
dists = dw2.scaled %>%
column_to_rownames("title") %>%
dist(method = "euclidean")
hc = hclust(dists, method = "ward.D")
plot(hc, cex = .6)
plot(hc, hang = -1)
n_clusters = 4
rect.hclust(hc, k=n_clusters)
dw2 <- dw2 %>%
mutate(cluster = hc %>%
cutree(k = n_clusters) %>%
as.character())
dw2.scaled <- dw2.scaled %>%
mutate(cluster = hc %>%
cutree(k = n_clusters) %>%
as.character())
dw2.long = melt(dw2.scaled, id.vars = c("title", "cluster"))
hc %>%
cutree(k = n_clusters) %>%
silhouette(dists) %>%
plot(col = RColorBrewer::brewer.pal(n_clusters, "Set2"))
dw2.long %>%
ggplot(aes(x = variable, y = value, group = title, colour = cluster)) +
geom_line(alpha = 0.4) +
facet_wrap(~ cluster)
p <- dw2 %>%
plot_ly(type = 'parcoords',
line = list(color = ~cluster),
dimensions = list(
#list(range = c(1, 4), label = "cluster", values = ~cluster),
list(range = c(0, 4),
label = 'n_f', values = ~n_f),
list(range = c(0, 4),
constraintrange = c(5,6),
label = 'words_f', values = ~words_f),
list(range = c(0, 4),
label = 'n_m', values = ~n_m),
list(range = c(0, 4),
label = 'words_m', values = ~words_m)
)
)
p
dw2.scaled = dw2.scaled %>%
select(-cluster) # Remove o cluster adicionado antes lá em cima via hclust
# O agrupamento de fato:
km = dw2.scaled %>%
select(-title) %>%
kmeans(centers = n_clusters, nstart = 20)
# O df em formato longo, para visualização
dw2.scaled.km.long = km %>%
augment(dw2.scaled) %>% # Adiciona o resultado de km
# aos dados originais dw2.scaled em
# uma variável chamada .cluster
gather(key = "variável",
value = "valor",
-title, -.cluster) # = move para long todas as
# variávies menos repository_language
# e .cluster
dw2.scaled.km.long %>%
ggplot(aes(x = `variável`, y = valor, group = title, colour = .cluster)) +
#geom_point(alpha = 0.2) +
geom_line(alpha = .5) +
facet_wrap(~ .cluster)
autoplot(km, data = dw2.scaled, label = TRUE)
dists = dw2.scaled %>%
select(-title) %>%
dist() # só para plotar silhouetas depois
plot(silhouette(km$cluster, dists), col = RColorBrewer::brewer.pal(n_clusters, "Set2"))
#não funciona...
#table(km %>% pull(cluster))
#km %>% pull(cluster) %>% table()
#summary(dw2.scaled)
p <- km %>%
augment(dw2.scaled) %>%
plot_ly(type = 'parcoords',
line = list(color = ~.cluster,
showScale = TRUE),
dimensions = list(
#list(range = c(1, 4), label = "cluster", values = ~cluster),
list(range = c(-3, 3),
label = 'n_f', values = ~n_f),
list(range = c(-3, 3),
label = 'words_f', values = ~words_f),
list(range = c(-6, 3),
label = 'n_m', values = ~n_m),
list(range = c(-2, 3),
label = 'words_m', values = ~words_m)
)
)
p
Qual seria um bom valor de k? Uma medida comumente usada no kmeans é comparar a distância (quadrática) entre o centro dos clusters e o centro dos dados com a distância (quadrática) entre os pontos todos nos dados e o centro dos dados. Aqui o centro dos dados é um ponto imaginário na média de todas as variáveis. Calculamos a distância do centro de cada cluster para o centro dos dados e multiplicamos pelo número de pontos nesse cluster. Somando esse valor para todos os clusters, temos betweenss abaixo. Se esse valor for próximo do somatório total das distâncias dos pontos para o centro dos dados (totss), os pontos estão próximos do centro de seu cluster. Essa proporção pode ser usada para definir um bom valor de k. Quando ela para de crescer, para de valer à pena aumentar k.
set.seed(123)
explorando_k = tibble(k = 1:15) %>%
group_by(k) %>%
do(
kmeans(select(dw2.scaled, -title),
centers = .$k,
nstart = 20) %>% glance()
)
explorando_k %>%
ggplot(aes(x = k, y = betweenss / totss)) +
geom_line() +
geom_point()